{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('/home/luke/Desktop/kaggle/Home_Credit_Default_Risk')\n",
    "application_test = pd.read_csv('application_test.csv')\n",
    "application_train = pd.read_csv('application_train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3 columns were label encoded.\n"
     ]
    }
   ],
   "source": [
    "le  = LabelEncoder()\n",
    "le_count = 0\n",
    "\n",
    "for col in application_train:\n",
    "    if application_train[col].dtype == 'object':\n",
    "        if len(list(application_train[col].unique())) <= 2:\n",
    "            le.fit(application_train[col])\n",
    "            application_train[col] = le.transform(application_train[col])\n",
    "            application_test[col] = le.transform(application_test[col])\n",
    "            \n",
    "            le_count += 1\n",
    "print('%d columns were label encoded.' % le_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_train = pd.get_dummies(application_train)\n",
    "application_test = pd.get_dummies(application_test)\n",
    "train_labels = application_train['TARGET']\n",
    "application_train,application_test = application_train.align(application_test,join='inner',axis = 1)\n",
    "application_train['TARGET'] = train_labels\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make a new dataframe for polynomial features\n",
    "poly_features = application_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']]\n",
    "poly_features_test = application_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]\n",
    "\n",
    "# imputer for handling missing values\n",
    "from sklearn.preprocessing import Imputer\n",
    "imputer = Imputer(strategy = 'median')\n",
    "\n",
    "poly_target = poly_features['TARGET']\n",
    "\n",
    "poly_features = poly_features.drop(columns = ['TARGET'])\n",
    "\n",
    "# Need to impute missing values\n",
    "poly_features = imputer.fit_transform(poly_features)\n",
    "poly_features_test = imputer.transform(poly_features_test)\n",
    "\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "                                  \n",
    "# Create the polynomial object with specified degree\n",
    "poly_transformer = PolynomialFeatures(degree = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Polynomial Features shape:  (307511, 35)\n"
     ]
    }
   ],
   "source": [
    "poly_transformer.fit(poly_features)\n",
    "\n",
    "# Transform the features\n",
    "poly_features = poly_transformer.transform(poly_features)\n",
    "poly_features_test = poly_transformer.transform(poly_features_test)\n",
    "print('Polynomial Features shape: ', poly_features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['1',\n",
       " 'EXT_SOURCE_1',\n",
       " 'EXT_SOURCE_2',\n",
       " 'EXT_SOURCE_3',\n",
       " 'DAYS_BIRTH',\n",
       " 'EXT_SOURCE_1^2',\n",
       " 'EXT_SOURCE_1 EXT_SOURCE_2',\n",
       " 'EXT_SOURCE_1 EXT_SOURCE_3',\n",
       " 'EXT_SOURCE_1 DAYS_BIRTH',\n",
       " 'EXT_SOURCE_2^2',\n",
       " 'EXT_SOURCE_2 EXT_SOURCE_3',\n",
       " 'EXT_SOURCE_2 DAYS_BIRTH',\n",
       " 'EXT_SOURCE_3^2',\n",
       " 'EXT_SOURCE_3 DAYS_BIRTH',\n",
       " 'DAYS_BIRTH^2']"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "poly_transformer.get_feature_names(input_features = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])[:15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EXT_SOURCE_2 EXT_SOURCE_3                -0.193939\n",
      "EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3   -0.189605\n",
      "EXT_SOURCE_2^2 EXT_SOURCE_3              -0.176428\n",
      "EXT_SOURCE_2 EXT_SOURCE_3^2              -0.172282\n",
      "EXT_SOURCE_1 EXT_SOURCE_2                -0.166625\n",
      "EXT_SOURCE_1 EXT_SOURCE_3                -0.164065\n",
      "EXT_SOURCE_2                             -0.160295\n",
      "EXT_SOURCE_1 EXT_SOURCE_2^2              -0.156867\n",
      "EXT_SOURCE_3                             -0.155892\n",
      "EXT_SOURCE_1 EXT_SOURCE_3^2              -0.150822\n",
      "Name: TARGET, dtype: float64\n",
      "EXT_SOURCE_1 EXT_SOURCE_2 DAYS_BIRTH    0.155891\n",
      "EXT_SOURCE_2 DAYS_BIRTH                 0.156873\n",
      "EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH    0.181283\n",
      "TARGET                                  1.000000\n",
      "1                                            NaN\n",
      "Name: TARGET, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Create a dataframe of the features \n",
    "poly_features = pd.DataFrame(poly_features, \n",
    "                             columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', \n",
    "                                                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))\n",
    "\n",
    "# Add in the target\n",
    "poly_features['TARGET'] = poly_target\n",
    "\n",
    "# Find the correlations with the target\n",
    "poly_corrs = poly_features.corr()['TARGET'].sort_values()\n",
    "\n",
    "# Display most negative and most positive\n",
    "print(poly_corrs.head(10))\n",
    "print(poly_corrs.tail(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data with polynomial features shape:  (307511, 274)\n",
      "Testing data with polynomial features shape:   (48744, 274)\n"
     ]
    }
   ],
   "source": [
    "# Put test features into dataframe\n",
    "poly_features_test = pd.DataFrame(poly_features_test, \n",
    "                                  columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', \n",
    "                                                                                'EXT_SOURCE_3', 'DAYS_BIRTH']))\n",
    "\n",
    "# Merge polynomial features into training dataframe\n",
    "poly_features['SK_ID_CURR'] = application_train['SK_ID_CURR']\n",
    "app_train_poly = application_train.merge(poly_features, on = 'SK_ID_CURR', how = 'left')\n",
    "\n",
    "# Merge polnomial features into testing dataframe\n",
    "poly_features_test['SK_ID_CURR'] = application_test['SK_ID_CURR']\n",
    "app_test_poly = application_test.merge(poly_features_test, on = 'SK_ID_CURR', how = 'left')\n",
    "\n",
    "# Align the dataframes\n",
    "app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)\n",
    "\n",
    "# Print out the new shapes\n",
    "print('Training data with polynomial features shape: ', app_train_poly.shape)\n",
    "print('Testing data with polynomial features shape:  ', app_test_poly.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data shape:  (307511, 239)\n",
      "Testing data shape:  (48744, 239)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler, Imputer\n",
    "\n",
    "# Drop the target from the training data\n",
    "if 'TARGET' in application_train:\n",
    "    train = application_train.drop(columns = ['TARGET'])\n",
    "else:\n",
    "    train = application_train.copy()\n",
    "features = list(train.columns)\n",
    "\n",
    "# Copy of the testing data\n",
    "test = application_test.copy()\n",
    "\n",
    "# Median imputation of missing values\n",
    "imputer = Imputer(strategy = 'median')\n",
    "\n",
    "# Scale each feature to 0-1\n",
    "scaler = MinMaxScaler(feature_range = (0, 1))\n",
    "\n",
    "# Fit on the training data\n",
    "imputer.fit(train)\n",
    "\n",
    "# Transform both training and testing data\n",
    "train = imputer.transform(train)\n",
    "test = imputer.transform(application_test)\n",
    "\n",
    "# Repeat with the scaler\n",
    "scaler.fit(train)\n",
    "train = scaler.transform(train)\n",
    "test = scaler.transform(test)\n",
    "\n",
    "print('Training data shape: ', train.shape)\n",
    "print('Testing data shape: ', test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold 1 AUC : 0.755236\n",
      "Fold 2 AUC : 0.757236\n",
      "Fold 3 AUC : 0.762423\n",
      "Fold 4 AUC : 0.757265\n",
      "Fold 5 AUC : 0.758051\n"
     ]
    }
   ],
   "source": [
    "# Format the training and testing data \n",
    "train = np.array(application_train.drop(columns = 'TARGET'))\n",
    "test = np.array(application_test)\n",
    "\n",
    "train_labels = np.array(train_labels).reshape((-1, ))\n",
    "\n",
    "# 10 fold cross validation\n",
    "folds = KFold(n_splits=5, shuffle=True, random_state=50)\n",
    "\n",
    "# Validation and test predictions\n",
    "valid_preds = np.zeros(train.shape[0])\n",
    "test_preds = np.zeros(test.shape[0])\n",
    "\n",
    "# Iterate through each fold\n",
    "for n_fold, (train_indices, valid_indices) in enumerate(folds.split(train)):\n",
    "    # Training data for the fold\n",
    "    train_fold, train_fold_labels = train[train_indices, :], train_labels[train_indices]\n",
    "    \n",
    "    # Validation data for the fold\n",
    "    valid_fold, valid_fold_labels = train[valid_indices, :], train_labels[valid_indices]\n",
    "    \n",
    "    # LightGBM classifier with hyperparameters\n",
    "    clf = LGBMClassifier(\n",
    "        n_estimators=10000,\n",
    "        learning_rate=0.1,\n",
    "        subsample=.8,\n",
    "        max_depth=7,\n",
    "        reg_alpha=.1,\n",
    "        reg_lambda=.1,\n",
    "        min_split_gain=.01,\n",
    "        min_child_weight=2\n",
    "    )\n",
    "    \n",
    "    # Fit on the training data, evaluate on the validation data\n",
    "    clf.fit(train_fold, train_fold_labels, \n",
    "            eval_set= [(train_fold, train_fold_labels), (valid_fold, valid_fold_labels)], \n",
    "            eval_metric='auc', early_stopping_rounds=100, verbose = False\n",
    "           )\n",
    "    \n",
    "    # Validation preditions\n",
    "    valid_preds[valid_indices] = clf.predict_proba(valid_fold, num_iteration=clf.best_iteration_)[:, 1]\n",
    "    \n",
    "    # Testing predictions\n",
    "    test_preds += clf.predict_proba(test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits\n",
    "    \n",
    "    # Display the performance for the current fold\n",
    "    print('Fold %d AUC : %0.6f' % (n_fold + 1, roc_auc_score(valid_fold_labels, valid_preds[valid_indices])))\n",
    "    \n",
    "    # Delete variables to free up memory\n",
    "    del clf, train_fold, train_fold_labels, valid_fold, valid_fold_labels\n",
    "    gc.collect()\n",
    "    \n",
    "\n",
    "# Make a submission dataframe\n",
    "#submission = application_test[['SK_ID_CURR']]\n",
    "#submission['TARGET'] = test_preds\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras import models\n",
    "from keras import layers\n",
    "from keras import regularizers\n",
    "from keras.wrappers.scikit_learn import KerasClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_val = pd.read_csv('x_val.csv')\n",
    "y_val = pd.read_csv('y_val.csv')\n",
    "partial_y_train = pd.read_csv('partial_y_train.csv')\n",
    "partial_x_train = pd.read_csv('partial_x_train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test(unit,epochs,batch_size):\n",
    "    model = models.Sequential()\n",
    "    model.add(layers.Dense(64, input_shape=(239,)))\n",
    "    model.add(layers.Dense(64, activation='relu'))\n",
    "    model.add(layers.Dense(1, activation='sigmoid'))\n",
    "    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "    model.fit(partial_x_train,partial_y_train, epochs=20, verbose=0,batch_size=100,validation_data=(x_val, y_val))\n",
    "    y_pred = model.predict_proba(x_test_use)\n",
    "    return roc_auc_score(y_val, model.predict_proba(x_val))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Function to create model, required for KerasClassifier\n",
    "def create_model(optimizer, init,reg):\n",
    "    model = models.Sequential()\n",
    "    model.add(layers.Dense(64, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg),input_shape=(239,)))\n",
    "    model.add(layers.Dense(64, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg),activation='relu'))\n",
    "    model.add(layers.Dense(1, kernel_initializer=init, activation='sigmoid'))\n",
    "    model.compile(loss='binary_crossentropy',optimizer=optimizer, metrics=['accuracy'])\n",
    "    return(model)\n",
    "model = KerasClassifier(build_fn=create_model, verbose=0)\n",
    "\n",
    "# grid search epochs, batch size and optimizer\n",
    "optimizers = ['rmsprop', 'adam']\n",
    "init = ['normal']\n",
    "reg = [0.01,0.05,0.1,0.5]\n",
    "epochs = [20,25]\n",
    "batches = [200,400,500]\n",
    "\n",
    "param_grid = dict(optimizer = optimizers, init = init,epochs = epochs,batch_size = batches,reg =reg)\n",
    "grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc')\n",
    "grid_result = grid.fit(partial_x_train, partial_y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Best: %f using %s\" % (grid_result.best_score_, grid_result.best_params_))\n",
    "means = grid_result.cv_results_['mean_test_score']\n",
    "stds = grid_result.cv_results_['std_test_score']\n",
    "params = grid_result.cv_results_['params']\n",
    "for mean, stdev, param in zip(means, stds, params):\n",
    "\tprint(\"%f (%f) with: %r\" % (mean, stdev, param))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_test = pd.read_csv('x_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_test_use = np.array(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(48743, 239)"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_test_use.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_val, model.predict_proba(x_val))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
